The E. coli scheme from Enterobase [1] was used with chewBBACA [2] to get the cgMLST profile for each isolate. The number of missing loci in total (sum of LNF, NIPH, PLOT, ALM and ASM from chewBBACA) is plotted below for all isolates.
# Vector of excluded samples
ex_samples <- c(
"2016-17-565-1-S",
"2016-02-415-2-S",
"2016-02-428-2-S",
"2016-02-486-2-S",
"2016-02-732-2-S",
"2014-01-1741-1-S"
)
# Collapses data frame to unique lines while ignoring NA's
func_paste <- function(x) paste(unique(x[!is.na(x)]), collapse = ", ")
"%not_in%" <- Negate("%in%")
# Isolate data
id_data <- read.table("../data/id.txt",
sep = "\t",
header = TRUE,
stringsAsFactors = F)
isolate_data <- read.table("../data/isolate_data.txt",
sep = "\t",
header = T,
stringsAsFactors = F) %>%
filter(id %not_in% ex_samples)
# cgMLST data
cgmlst_stats <- read.table("../data/chewbbaca/complete_results/mdata_stats.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE) %>%
filter(FILE != "2016-02-415_S208_pilon_spades.fasta")
detailed_cgmlst_stats <- read.table("../data/chewbbaca/results_statistics.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE) %>%
filter(Genome != "2016-02-415_S208_pilon_spades.fasta")
cgMLST_allele_data <- read.table("../data/chewbbaca/complete_results/cgMLST.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(FILE = sub("_pilon_spades.fasta", "", FILE)) %>%
dplyr::rename("ref" = FILE) %>%
left_join(id_data, by = "ref") %>%
select(id, everything(), -ref) %>%
filter(id %not_in% ex_samples)boxplot(cgmlst_stats$number.of.missing.data,
ylab = "Number of missing loci")Below follows the detailed results for all loci in the cgMLST scheme. EXC = excact matches, INF = novel allele, LNF = locus not found, NIPH = paralogous locus, PLOT = possible loci on tip of contig, ALM = allele 20 % larger than reference, ASM = allele 20 % smaller than reference. The numbers below each boxplot denote the mean.
test <- detailed_cgmlst_stats %>%
gather(metric, value, -Genome) %>%
group_by(metric) %>%
mutate(mean_val = round(mean(value), 1))
ggplot(test, aes(metric, value)) +
stat_boxplot(geom = "errorbar", width = 0.5) +
geom_boxplot() +
geom_text(data = subset(test,
Genome == "2006-01-2184_S30_pilon_spades.fasta"),
aes(label = mean_val),
y = -60,
size = 3.8) +
scale_x_discrete(limits = c("EXC",
"INF",
"LNF",
"NIPH",
"PLOT",
"ALM",
"ASM")) +
labs(y = "Number of loci") +
theme(axis.title.x = element_blank(),
axis.ticks.x = element_blank())Genetic distances were calculated from the allele data from the cgMLST report, and a dendrogram was drawn from these distances. The annotations are based on data from the ARIBA resistance gene analysis.
cgMLST_data <- read.table("../data/chewbbaca/complete_results/clean_cgMLST_data.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
total_tree <- as.phylo(hclust(daisy(cgMLST_data, metric = "gower"), method = "average"))
tree_metadata <- read.table("../data/chewbbaca/complete_results/tree_metadata.txt",
sep = "\t",
header = TRUE)
tree_heatmap_data <- read.table("../data/chewbbaca/complete_results/tree_heatmap_data.txt",
sep = "\t",
header = TRUE) %>%
select(-gadX)
names(tree_heatmap_data) <- c("GyrA","ParC","ParE","RpoB","MarA","MarR","SoxR","qepA4","qnr")
palette <- c("Broiler" = "#4575b4",
"Pig" = "#74add1",
"Red fox" = "#f46d43",
"Wild bird" = "#fdae61")
palette2 <- c("1-A" = "#fc8d59",
"1-I" = "#80b1d3",
"0" = "grey95")
total_tree_annotated <- ggtree(total_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(aes(color = species),
size = 3) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 4) +
scale_color_manual(values = palette)
total_tree_opened <- rotate_tree(open_tree(total_tree_annotated, 8), 93)
gheatmap(total_tree_opened,
tree_heatmap_data,
offset = 0.04,
width = 0.3,
colnames_offset_y = 2.5,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette2,
labels = c("Gene/Mutation absent",
"Acquired gene present",
"Intrinsic gene mutated")) +
theme(legend.position = c(0.9,0.9),
legend.text = element_text(size = 20))metadata_species <- read.table("../data/chewbbaca/complete_results/metadata_species_specific_trees.txt",
sep = "\t",
header = TRUE) %>%
select(gyrA, parC, parE, PMQR) %>%
dplyr::rename("GyrA" = gyrA,
"ParC" = parC,
"ParE" = parE)
palette3 <- c("0" = "grey95",
# gyrA
"S83L" = "#c6dbef",
"S83A" = "#9ecae1",
"D87N" = "#6baed6",
"D87H" = "#4292c6",
"D87Y" = "#2171b5",
"D87G" = "#08519c",
"S83L, D87N" = "#08306b",
# parC
"S80I" = "#a1d99b",
"S80R" = "#74c476",
"S57T" = "#41ab5d",
"A56T, S80I" = "#238b45",
"S80I, E84V" = "#005a32",
# parE
"D475E" = "#fdd0a2",
"D463N" = "#fdae6b",
"S458A" = "#fd8d3c",
"L416F" = "#f16913",
"H516Y" = "#d94801",
"L488M, A512T" = "#8c2d04",
# PMQR
"qnrA1" = "#dadaeb",
"qnrB19" = "#bcbddc",
"qnrS1" = "#9e9ac8",
"qnrS2" = "#807dba",
"qnrS4" = "#6a51a3",
"qepA4" = "#4a1486")
# Broiler tree
broiler_tree <- calc_tree("../data/chewbbaca/complete_results/cgMLST_broiler.tsv")
broiler_tree_annotated <- ggtree(broiler_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#4575b4",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.02,
size = 6)
broiler_complete <- gheatmap(broiler_tree_annotated,
metadata_species,
offset = 0.1,
width = 0.5,
colnames = FALSE) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Pig tree
pig_tree <- calc_tree("../data/chewbbaca/complete_results/cgMLST_pig.tsv")
pig_tree_annotated <- ggtree(pig_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#74add1",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.02,
size = 5.5)
pig_complete <- gheatmap(pig_tree_annotated,
metadata_species,
offset = 0.09,
width = 0.5,
colnames = FALSE) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Red fox tree
fox_tree <- calc_tree("../data/chewbbaca/complete_results/cgMLST_fox.tsv")
fox_tree_annotated <- ggtree(fox_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#f46d43",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.02,
size = 5.5)
fox_complete <- gheatmap(fox_tree_annotated,
metadata_species,
offset = 0.09,
width = 0.5,
colnames = FALSE) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Wild bird tree
bird_tree <- calc_tree("../data/chewbbaca/complete_results/cgMLST_bird.tsv")
bird_tree_annotated <- ggtree(bird_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#fdae61",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.02,
size = 5.5)
bird_complete <- gheatmap(bird_tree_annotated,
metadata_species,
offset = 0.09,
width = 0.5,
colnames = FALSE) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Total plot
tot_plot <- plot_grid(broiler_complete,
pig_complete,
fox_complete,
bird_complete,
nrow = 2,
ncol = 2,
align = "h",
labels = c("Broiler","Pig","Red fox","Wild bird"))
include_graphics("../figures/species_dendrogram2.png")A minimum spanning tree calculated from the cgMLST data with GrapeTree [3] using MSTreeV2. Number of allele differences are denoted as numbers on branches between nodes. The branches have been collapsed on ten differences or less. The number inside each node is the sequence type based on the Warwick 7-gene MLST scheme [4].
include_graphics("../figures/GrapeTree_cgMLST.png")Below follows a plot of the distribution of the lower triangular distances for each animal species. A bimodal distribution is present in all four plots. The filtering values of < 0.1 and > 0.8 is used for downstream statistics to acertain whether the broiler isolates are more homogenous than the isolates from other species.
# calculates distances from cgMLST data and returns
# the lower triangular of the matrix, with (diag = FALSE)
# or without (diag = TRUE) the diagonal
lower_tri_dist_calc <- function(data, diag = TRUE, data_frame = FALSE) {
dist <- as.matrix(daisy(data, metric = "gower"))
if (diag == TRUE) {
dist[upper.tri(dist, diag = TRUE)] <- NA
} else {
dist[upper.tri(dist, diag = FALSE)] <- NA
}
dist_vec <- as.vector(as.matrix(dist))
dist_compl <- dist_vec[!is.na(dist_vec)]
if (data_frame == TRUE) {
dist_compl <- as.data.frame(as.matrix(dist))
}
return(dist_compl)
}
broiler_cgMLST <- read.table("../data/chewbbaca/complete_results/cgMLST_broiler.tsv",
sep = "\t",
header = TRUE,
colClasses = "factor",
row.names = 1)
pig_cgMLST <- read.table("../data/chewbbaca/complete_results/cgMLST_pig.tsv",
sep = "\t",
header = TRUE,
colClasses = "factor",
row.names = 1)
fox_cgMLST <- read.table("../data/chewbbaca/complete_results/cgMLST_fox.tsv",
sep = "\t",
header = TRUE,
colClasses = "factor",
row.names = 1)
bird_cgMLST <- read.table("../data/chewbbaca/complete_results/cgMLST_bird.tsv",
sep = "\t",
header = TRUE,
colClasses = "factor",
row.names = 1)
broiler_dist <- lower_tri_dist_calc(broiler_cgMLST)
pig_dist <- lower_tri_dist_calc(pig_cgMLST)
fox_dist <- lower_tri_dist_calc(fox_cgMLST)
bird_dist <- lower_tri_dist_calc(bird_cgMLST)
# Distance plots
par(mfrow = c(2,2))
x1 <- hist(broiler_dist,
breaks = 20,
plot = FALSE)
x1$density = x1$counts/sum(x1$counts)*100
plot(x1,
freq = FALSE,
main = "Broiler distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x2 <- hist(pig_dist,
breaks = 20,
plot = FALSE)
x2$density = x2$counts/sum(x2$counts)*100
plot(x2,
freq = FALSE,
main = "Pig distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x3 <- hist(fox_dist,
breaks = 20,
plot = FALSE)
x3$density = x3$counts/sum(x3$counts)*100
plot(x3,
freq = FALSE,
main = "Red fox distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x4 <- hist(bird_dist,
breaks = 20,
plot = FALSE)
x4$density = x4$counts/sum(x4$counts)*100
plot(x4,
freq = FALSE,
main = "Wild bird distance frequency",
xlab = "Distance",
ylim = c(0, 50))Distribution of percentages of distances < 0.1 or > 0.8. The distribution is based on 1000 iterations of the distance data. Observed values are denoted as arrows for each animal species; dark blue = Broiler, light blue = Pig, red = Red fox, yellow = Wild bird. The mean value is denoted as a vertical black line.
# generate data frame from allele data
alleles <- cgMLST_allele_data %>%
left_join(isolate_data[,c("id","species")], by = "id") %>%
group_by(species) %>%
mutate(n = 1:n(),
new_id = case_when(species == "Broiler" ~ paste("BR", n, sep = "_"),
species == "Pig" ~ paste("PI", n, sep = "_"),
species == "Red fox" ~ paste("RF", n, sep = "_"),
species == "Wild bird" ~ paste("WB", n, sep = "_"))) %>%
ungroup() %>%
select(new_id, everything(), -c(species, id)) %>%
mutate_all(funs(as.factor)) %>%
column_to_rownames("new_id")
# generate data frame for iteration analysis
run_df <- lower_tri_dist_calc(alleles, data_frame = TRUE) %>%
rownames_to_column("id") %>%
gather(isolate1, value, -id) %>%
dplyr::rename("isolate2" = id) %>%
mutate(group1 = substr(isolate1, start = 1, stop = 2),
group2 = substr(isolate2, start = 1, stop = 2)) %>%
select(isolate1, group1, isolate2, group2, value) %>%
filter(is.na(value) == FALSE,
group1 == group2) %>%
arrange(group1, group2)
# functions used in iteration analyses
## Calculates the sum of observations lower than dist_value1 and
## bigger than dist_value2, and calculates the percentage of
## observed values for each group
without <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value < dist_value1 | value > dist_value2) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
spread(n, nn) %>%
mutate(Percent = round(Within / (Within + Without) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations between dist_value1 and dist_value2
## and calculates the percentage of observed values for each group
within <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value > dist_value1 & value < dist_value2) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
spread(n, nn) %>%
mutate(Percent = round(Within / (Within + Without) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations below dist_value1
## for each group
below_calc <- function(df, run, dist_value1 = 0.1) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value < dist_value1) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Below", "Above")) %>%
spread(n, nn) %>%
mutate(Below = ifelse(is.na(Below) == TRUE, 0, Below)) %>%
mutate(Percent_below = round(Below / (Above + Below) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations above dist_value1
## for each group
above_calc <- function(df, run, dist_value1 = 0.9) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value > dist_value1) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Above", "Below")) %>%
spread(n, nn) %>%
mutate(Above = ifelse(is.na(Above) == TRUE, 0, Above)) %>%
mutate(Percent_above = round(Above / (Above + Below) * 100, 2),
iter = run)
return(df)
}
## Calculates the mean of observations for each group
mean_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(metric = round(mean(value), 2)) %>%
select(group1, group2, metric) %>%
summarise_all(funs(func_paste)) %>%
mutate(iter = run,
metric = as.numeric(metric))
}
## Calculates the median of observations for each group
median_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(metric = round(median(value), 2)) %>%
select(group1, group2, metric) %>%
summarise_all(funs(func_paste)) %>%
mutate(iter = run,
metric = as.numeric(metric))
}
## Calculates the quantiles of the observations for each group
quantile_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(quant_0 = quantile(value)[1],
quant_25 = quantile(value)[2],
quant_50 = quantile(value)[3],
quant_75 = quantile(value)[4],
quant_100 = quantile(value)[5]) %>%
select(group1, group2, contains("quant")) %>%
summarise_all(funs(func_paste)) %>%
mutate_at(vars(contains("quant")),
funs(as.numeric)) %>%
mutate(iter = run)
}
# Iteration function
iterate_samples <- function(df, run, method = "mean", dist_value1, dist_value2) {
# randomize the value column, regardless of groups
df <- df %>%
mutate(value = sample(value))
# run specified function
if (method == "mean") {
df <- mean_calc(df, run)
}
if (method == "below") {
df <- below_calc(df, run, dist_value1)
}
if (method == "above") {
df <- above_calc(df, run, dist_value1)
}
if (method == "within") {
df <- within(df, run, dist_value1, dist_value2)
}
if (method == "without") {
df <- without(df, run, dist_value1, dist_value2)
}
if (method == "median") {
df <- median_calc(df, run)
}
if (method == "quantile") {
df <- quantile_calc(df, run)
}
return(df)
}
# run iterations on selected data frame
run_iterations <- function(df, runs = 1000, seed = 10, method = "mean", dist_value1, dist_value2) {
# set seed
set.seed(seed)
# create output list
output <- list()
# create expected values
if (method == "mean") {
orig <- mean_calc(df, 0)
}
if (method == "below") {
orig <- below_calc(df, 0, dist_value1)
}
if (method == "above") {
orig <- above_calc(df, 0, dist_value1)
}
if (method == "within") {
orig <- within(df, 0, dist_value1, dist_value2)
}
if (method == "without") {
orig <- without(df, 0, dist_value1, dist_value2)
}
if (method == "median") {
orig <- median_calc(df, 0)
}
if (method == "quantile") {
orig <- quantile_calc(df, 0)
}
output <- c(output, list(orig))
# run iterations
for (i in 1:runs) {
result <- iterate_samples(df,
i,
method = method,
dist_value1 = dist_value1,
dist_value2 = dist_value2)
output <- c(output, list(result))
}
return(output)
}
# Run functions
summary_stats <- run_df %>%
mutate(total_mean = round(mean(value), 3)) %>%
group_by(group1) %>%
mutate(mean = round(mean(value), 3),
var = round(var(value), 3)) %>%
select(group1, total_mean, mean, var) %>%
summarise_all(funs(func_paste))
complete_data <- run_iterations(run_df,
runs = 1000,
method = "without",
dist_value1 = 0.1,
dist_value2 = 0.8) %>%
bind_rows() %>%
mutate(Percent = as.numeric(Percent))
lower_data <- run_iterations(run_df,
runs = 1000,
method = "below",
dist_value1 = 0.1) %>%
bind_rows() %>%
mutate(Percent_below = as.numeric(Percent_below))
summary_stats# Plots
ggplot(complete_data, aes(Percent)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
fill = "grey80") +
stat_function(fun = dnorm,
args = with(complete_data, c(mean = mean(Percent), sd = sd(Percent))),
color = "red") +
geom_segment(aes(x = complete_data$Percent[1],
xend = complete_data$Percent[1], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#4575b4") +
geom_segment(aes(x = complete_data$Percent[2],
xend = complete_data$Percent[2], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#74add1") +
geom_segment(aes(x = complete_data$Percent[3],
xend = complete_data$Percent[3], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#f46d43") +
geom_segment(aes(x = complete_data$Percent[4],
xend = complete_data$Percent[4], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#fdae61") +
geom_text(label = "p < 0.001",
x = 83.30,
y = 0.16,
angle = 90) +
geom_vline(xintercept = mean(complete_data$Percent),
size = 1) +
labs(x = "Percent (%) distances < 0.1 or > 0.8",
y = "Density") +
ggtitle(label = "Distribution of percentages",
subtitle = "Number of iterations: 1000") +
theme(plot.title = element_text(hjust = 0))ggplot(lower_data, aes(Percent_below)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
fill = "grey80") +
stat_function(fun = dnorm,
args = with(lower_data, c(mean = mean(Percent_below), sd = sd(Percent_below))),
color = "red") +
geom_segment(aes(x = lower_data$Percent_below[1],
xend = lower_data$Percent_below[1], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#4575b4") +
geom_segment(aes(x = lower_data$Percent_below[2],
xend = lower_data$Percent_below[2], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#74add1") +
geom_segment(aes(x = lower_data$Percent_below[3],
xend = lower_data$Percent_below[3], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#f46d43") +
geom_segment(aes(x = lower_data$Percent_below[4],
xend = lower_data$Percent_below[4], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#fdae61") +
geom_text(label = "p < 0.001",
x = 5.87,
y = 0.32,
angle = 90) +
geom_vline(xintercept = mean(lower_data$Percent_below),
size = 1) +
labs(x = "Percent (%) distances < 0.1",
y = "Density") +
ggtitle(label = "Distribution of Percentages",
subtitle = "Number of iterations: 1000") +
theme(plot.title = element_text(hjust = 0))[1] Alikhan N-F et al. A genomic overview of the population structure of Salmonella. Casadesús J, editor. PLOS Genet [Internet]. 2018 Apr 5;14(4):e1007261. Available from: https://dx.plos.org/10.1371/journal.pgen.1007261
[2] Machado MP et al. chewBBACA: A complete suite for gene-by-gene schema creation and strain identification. Microb Genomics. 2018;1–7.
[3] Z Zhou et al. (2018) “GrapeTree: Visualization of core genomic relationships among 100,000 bacterial pathogens”, Genome Res. doi: https://doi.org/10.1101/gr.232397.117
[4] Wirth T et al. (2006) “Sex and virulence in Escherichia coli : an evolutionary perspective”, Molecular Microbiology (2006) 60 (5), 1136–1151, doi: 10.1111/j.1365-2958.2006.05172.x